Results: Side by Side comparison

# Package setup for the outlier-detection comparison notebook.
library(fields)      # Matern() covariance function
library(readr)
library(tidyverse, warn.conflicts = FALSE)
library(data.table)
# NOTE(review): dplyr, ggplot2, tidyr, and tibble are already attached by
# tidyverse above — the explicit loads below are redundant but harmless.
library(dplyr, warn.conflicts = FALSE)
library(ggplot2)
library(tidyr)
library(mvtnorm)
library(ParBayesianOptimization)  # bayesOpt() / getBestPars()
library(cowplot)
library(tibble)
# Provides get_likelihood(), get_likelihood_bayesopt(), plot_true_vs_pred()
source("SCRIPTS/outlier_detection_helper.R")

Nelder-Mead (per org) versus Nelder-Mead (per org, year pair)

Grid versus Nelder-Mead, per org per year

# Side-by-side view of the best hyperparameters per (org, year) pair from
# grid search vs. constrained Nelder-Mead (likelihood/sigma^2 columns dropped
# for readability). Expects `best_combos` in scope.
head(best_combos |> select(-likelihood.grid, -likelihood.CNM, -sigma.squared.grid, -sigma.squared.CNM), 20)
##               EIN year nu.grid nugget.grid nu.CNM nugget.CNM
## 1  EIN-04-2503758    0   0.190         0.1 0.1342     0.0000
## 2  EIN-04-2503758    1   0.190         0.2 0.1281     0.0000
## 3  EIN-04-2503758    2   0.190         0.2 0.1146     0.0000
## 4  EIN-04-2503758    3   0.190         0.2 0.1183     0.0000
## 5  EIN-04-2503758    4   0.190         0.1 0.1358     0.0000
## 6  EIN-04-2503758    6   0.190         0.0 0.1788     0.0000
## 7  EIN-04-2503758    7   0.355         0.0 0.4086     0.0000
## 8  EIN-04-2503758    8   0.190         0.0 0.1835     0.0000
## 9  EIN-04-2503758    9   0.190         0.2 0.1230     0.0000
## 10 EIN-04-2503758   10   0.190         0.2 0.1269     0.0000
## 11 EIN-04-2503758   11   0.190         0.2 0.1060     0.0000
## 12 EIN-04-2503758   12   0.190         0.3 0.0887     0.0000
## 13 EIN-04-2503758   13   0.190         0.2 0.1194     0.0000
## 14 EIN-13-2720896    0   0.520         0.0 1.2678     0.0051
## 15 EIN-13-2720896    1   0.685         0.0 0.9150     0.0045
## 16 EIN-13-2720896    2   0.685         0.0 1.0762     0.0046
## 17 EIN-13-2720896    3   0.520         0.0 1.2658     0.0040
## 18 EIN-13-2720896    4   0.520         0.0 1.2709     0.0041
## 19 EIN-13-2720896    5   0.520         0.0 1.3145     0.0039
## 20 EIN-13-2720896    6   0.520         0.0 1.2140     0.0043
# Count how often CNM matched or beat grid search on likelihood.
# paste0() replaces the paste(..., sep = "") idiom; output is identical.
print(paste0("Out of ", nrow(best_combos), " org-year pairs, constrained Nelder-Mead got the higher likelihood (compared to grid search) for ", sum(best_combos$likelihood.grid <= best_combos$likelihood.CNM), " pairs."))
## [1] "Out of 600 org-year pairs, constrained Nelder-Mead got the higher likelihood (compared to grid search) for 596 pairs."

Grid versus Nelder-Mead - per year ONLY

Grid versus Bayes - per year ONLY

Troubleshooting

Selected plots

Ranges (Bayesian Optimization)

  • \(\nu \in [0.025, 2.5]\)
  • \(\tau^2 \in [0, 2.5]\)
  • \(\sigma^2 \in [0.025, 1.5]\)

Ranges (Grid Search Optimization)

  • \(\nu \in [0.05, 2.5]\)
  • \(\tau^2 \in [0, 2]\)
  • \(\sigma^2 \in [0.05, 0.95]\)

All plots

Hyperparameter Tuning Code - one best set of params per org per year

Nelder Mead

Non parallel version

Hyperparameter Tuning Code - one best set of params per org

Nelder Mead

# Leave-one-out GP predictions per organization using the per-org
# constrained Nelder-Mead hyperparameters. For each org, sigma^2 is
# profiled out in closed form given the fitted Matern correlation, each
# year is predicted from all other years, and years with standardized
# residuals > 3 are flagged as candidate outliers.
# Expects in scope: df (centered log revenues), dist_mat (normalized year
# distance matrix), rho, and plot_true_vs_pred() from the sourced helper.
res <- readRDS("MODEL/outlier_detection/constrainNM_results.rds")
rownames(res) <- res[, "EIN"]
res <- res |> mutate(sigma.squared = 0)

all_predictions <- list()
all_candidates <- list()
all_se <- list()

mu_1 <- 0  # prior mean for the held-out year (data are centered per org)
# Looping over all organizations
start.time <- Sys.time()
for (ein in unique(df$EIN2)) {
      df.sub <- filter(df, EIN2 == ein)
      all_years <- df.sub$YEAR + 1 # 1-based indices into dist_mat for this org's years
      mu_2 <- numeric(nrow(df.sub) - 1) # prior mean of the conditioning years (all zeros)

      predictions <- numeric(length(all_years)) # preallocate
      standard_errors <- numeric(length(all_years)) # preallocate

      # Matern correlation (phi = 1) plus nugget; scaled by sigma^2 below
      mat_cov <- Matern(d = dist_mat,
                        smoothness = res[ein, ]$nu,
                        range = rho,
                        phi = 1) + diag(res[ein, ]$nugget, nrow(dist_mat))

      # Closed-form MLE of sigma^2 given the correlation structure
      sigma.squared <- (1 / length(df.sub$LOG_REV)) *
            (df.sub$LOG_REV %*% solve(mat_cov[all_years, all_years]) %*% df.sub$LOG_REV)[1, 1]
      mat_cov <- sigma.squared * mat_cov

      res[ein, ]$sigma.squared <- sigma.squared
      # seq_along() (not 1:length()) so an org with zero rows iterates zero times
      for (i in seq_along(all_years)) {
            # Leave out i-th year
            log_revenue <- (df |> filter(EIN2 == ein & YEAR != all_years[i] - 1))$LOG_REV

            SIGMA.11 <- mat_cov[all_years[i], all_years[i]]
            SIGMA.22.inv <- solve(mat_cov[all_years[-i], all_years[-i]])
            SIGMA.12 <- mat_cov[all_years[i], all_years[-i], drop = FALSE]

            # Conditional Gaussian mean / sd of the held-out year given the rest
            predictions[i] <- mu_1 + (SIGMA.12 %*% SIGMA.22.inv %*% (log_revenue - mu_2)) #mu_bar
            standard_errors[i] <- sqrt(as.numeric(SIGMA.11 - (SIGMA.12 %*% SIGMA.22.inv %*% t(SIGMA.12))))
      }

      residuals <- abs(((df |> filter(EIN2 == ein))$LOG_REV - predictions) / standard_errors) # standardized residuals

      all_predictions[[ein]] <- predictions
      all_se[[ein]] <- standard_errors
      # Convert 1-based year index back to calendar year (index 1 == 1989)
      all_candidates[[ein]] <- all_years[which(residuals > 3)] - 1 + 1989

      # NOTE(review): all_candidates[ein] (single bracket) passes a one-element
      # list, not the vector — confirm plot_true_vs_pred() expects that.
      p <- plot_true_vs_pred(df = df,
                             ein = ein,
                             predictions = predictions,
                             standard_errors = standard_errors,
                             all_years = all_years,
                             candidate_outliers = all_candidates[ein],
                             combo = res[ein, ], label = ein)
      print(p)
}

print(Sys.time() - start.time)

Bayesian Optimization

# Load the test dataset and center each organization's log revenue at zero,
# so a zero prior mean is appropriate downstream.
df <- as.data.table(read_csv("small_test.csv", show_col_types = FALSE))

df <- df |>
      group_by(EIN2) |>
      mutate(LOG_REV = LOG_REV - mean(LOG_REV)) |>
      ungroup()

# Pairwise distances between the distinct years, as a full symmetric matrix,
# then min-max rescaled onto [0, 1].
year_dist <- dist(sort(unique(df$YEAR)), diag = TRUE, upper = TRUE)
dist_mat <- as.matrix(year_dist)
dist_mat <- (dist_mat - min(dist_mat)) / (max(dist_mat) - min(dist_mat))

# Search space for bayesOpt and a container for each org's optimizer state.
bounds <- list(nu = c(0.025, 2.5), nugget = c(0, 2.5), sigma.squared = c(0.025, 1.5))
opt_info <- list()

# Run Bayesian optimization of the Matern hyperparameters once per org.
start.time <- Sys.time() # about two minutes per org / took about 50 minutes for whole dataset
for (ein in unique(df$EIN2)) {
      print(ein)
      df.subset <- df |> filter(EIN2 == ein)

      # bayesOpt only supplies the tuning parameters, so close over the
      # current org's data and observed years; redefined each iteration
      # because those differ org to org.
      get_likelihood_bayesopt_wrapper <- function(nu, nugget, sigma.squared) {
            score <- get_likelihood_bayesopt(distance.matrix = dist_mat,
                                             data = df.subset$LOG_REV,
                                             years = df.subset$YEAR + 1,
                                             nu, nugget, sigma.squared)
            list(Score = score)
      }

      opt_info[[ein]] <- bayesOpt(FUN = get_likelihood_bayesopt_wrapper,
                                  bounds = bounds,
                                  initPoints = 5,
                                  iters.n = 50,
                                  plotProgress = FALSE,
                                  verbose = 0)
}
print(Sys.time() - start.time)
# Leave-one-out GP predictions per organization using the bayesOpt-selected
# hyperparameters (nu, nugget, sigma.squared). Flags years whose standardized
# residual exceeds 3 as candidate outliers.
# Expects in scope: df, dist_mat, opt_info, and plot_true_vs_pred().
all_predictions <- list()
all_candidates <- list()
all_se <- list()
all_best_params <- list()

mu_1 <- 0  # prior mean for the held-out year (data are centered per org)
# Looping over all organizations
start.time <- Sys.time()
for (ein in unique(df$EIN2)) {
      all_years <- (df |> filter(EIN2 == ein))$YEAR + 1 # 1-based indices into dist_mat for this org's years
      mu_2 <- numeric(nrow(df |> filter(EIN2 == ein)) - 1) # prior mean of the conditioning years (all zeros)

      predictions <- numeric(length(all_years)) # preallocate
      standard_errors <- numeric(length(all_years)) # preallocate

      # Best hyperparameters found by Bayesian optimization for this org
      best.pars <- getBestPars(opt_info[[ein]])
      all_best_params[[ein]] <- best.pars

      # Matern covariance scaled by sigma^2 (phi), plus nugget on the diagonal
      mat_cov <- Matern(d = dist_mat,
                        smoothness = best.pars$nu,
                        range = 1,
                        phi = best.pars$sigma.squared) + diag(best.pars$nugget, nrow(dist_mat))

      # Train different model per org leave-one-out style.
      # seq_along() (not 1:length()) so an empty all_years iterates zero times.
      for (i in seq_along(all_years)) {
            # Leave out i-th year
            log_revenue <- (df |> filter(EIN2 == ein & YEAR != all_years[i] - 1))$LOG_REV

            SIGMA.11 <- mat_cov[all_years[i], all_years[i]]
            SIGMA.22.inv <- solve(mat_cov[all_years[-i], all_years[-i]])
            SIGMA.12 <- mat_cov[all_years[i], all_years[-i], drop = FALSE]

            # Conditional Gaussian mean / sd of the held-out year given the rest
            predictions[i] <- mu_1 + (SIGMA.12 %*% SIGMA.22.inv %*% (log_revenue - mu_2)) #mu_bar
            standard_errors[i] <- sqrt(as.numeric(SIGMA.11 - (SIGMA.12 %*% SIGMA.22.inv %*% t(SIGMA.12))))
      }

      residuals <- abs(((df |> filter(EIN2 == ein))$LOG_REV - predictions) / standard_errors) # standardized residuals

      all_predictions[[ein]] <- predictions
      all_se[[ein]] <- standard_errors
      # Convert 1-based year index back to calendar year (index 1 == 1989)
      all_candidates[[ein]] <- all_years[which(residuals > 3)] - 1 + 1989

      # NOTE(review): all_candidates[ein] (single bracket) passes a one-element
      # list, not the vector — confirm plot_true_vs_pred() expects that.
      p <- plot_true_vs_pred(df, ein, predictions, standard_errors, all_years, all_candidates[ein], best.pars)
      print(p)
}

print(Sys.time() - start.time)

Grid Search

# Load the test dataset and center each organization's log revenue at zero.
df <- as.data.table(read_csv("small_test.csv", show_col_types = FALSE))

df <- df |>
      group_by(EIN2) |>
      mutate(LOG_REV = LOG_REV - mean(LOG_REV)) |>
      ungroup()

# Pairwise distances between the distinct years as a full symmetric matrix.
year_dist <- dist(sort(unique(df$YEAR)), diag = TRUE, upper = TRUE)
dist_mat <- as.matrix(year_dist)

# Min-max rescale distances onto [0, 1].
min_val <- min(dist_mat)
max_val <- max(dist_mat)
dist_mat <- (dist_mat - min_val) / (max_val - min_val)

# Matern range parameter is held fixed; only nu, nugget, sigma^2 are tuned.
rho <- 1

# Candidate values for each hyperparameter
nu_vec <- seq(from = 0.05, to = 2.5, by = 0.245)
nugget_vec <- seq(from = 0, to = 2, by = 0.2)
sigma_squared_vec <- seq(from = 0.05, to = 0.95, by = 0.1)

# Full Cartesian grid of hyperparameter combinations to evaluate
combos <- expand.grid(nu = nu_vec, nugget = nugget_vec, sigma.squared = sigma_squared_vec)

# Grid Search: Looping over all organizations to find best set of hyperparameters
# For each org, evaluates the likelihood at every row of `combos` and records
# the best combination. Expects get_likelihood() from the sourced helper.
max_likelihoods = list()
max_likelihoods_idx = list()
all_likelihoods = list()

start.time <- Sys.time()
for (ein in unique(df$EIN2)){
      all_years <- (df |> filter(EIN2 == ein))$YEAR + 1 # all the years we have data for that org
      
      # get the likelihood for each combination of parameters
      # NOTE(review): apply() coerces the combos data.frame to a numeric
      # matrix, so `row` is a named numeric vector — get_likelihood()
      # presumably indexes it by name; confirm before restructuring.
      likelihoods <- apply(combos, 
                           MARGIN = 1, 
                           function(row) return(get_likelihood(row,
                                          dist_mat = dist_mat,
                                          rho = rho,
                                          all_years = all_years,
                                          data = (df |> filter(EIN2 == ein))$LOG_REV))
                           )
      all_likelihoods[[ein]] <- likelihoods
      max_likelihoods[[ein]] <- max(likelihoods)
      max_likelihoods_idx[[ein]] <- which.max(likelihoods)
}

print(Sys.time() - start.time)

# One row per org: the grid point with the highest likelihood, keyed by EIN
best_combos <- combos[as.vector(unlist(max_likelihoods_idx)),]
row.names(best_combos) <- names(max_likelihoods_idx)

best_combos
# Leave-one-out GP predictions per organization using the grid-search-selected
# hyperparameters. Flags years whose standardized residual exceeds 3.
# Expects in scope: df, dist_mat, rho, best_combos (columns nu, nugget,
# sigma.squared from expand.grid), and plot_true_vs_pred().
all_predictions <- list()
all_candidates <- list()
all_se <- list()

mu_1 <- 0  # prior mean for the held-out year (data are centered per org)
# Looping over all organizations
start.time <- Sys.time()
for (ein in unique(df$EIN2)) {
      all_years <- (df |> filter(EIN2 == ein))$YEAR + 1 # 1-based indices into dist_mat for this org's years
      mu_2 <- numeric(nrow(df |> filter(EIN2 == ein)) - 1) # prior mean of the conditioning years (all zeros)

      predictions <- numeric(length(all_years)) # preallocate
      standard_errors <- numeric(length(all_years)) # preallocate

      # Matern covariance scaled by sigma^2 (phi), plus nugget on the diagonal.
      # Named columns instead of positions 1/3/2 — same values, self-documenting.
      mat_cov <- Matern(d = dist_mat,
                        smoothness = best_combos[ein, "nu"],
                        range = rho,
                        phi = best_combos[ein, "sigma.squared"]) + diag(best_combos[ein, "nugget"], nrow(dist_mat))

      # seq_along() (not 1:length()) so an empty all_years iterates zero times
      for (i in seq_along(all_years)) {
            # Leave out i-th year
            log_revenue <- (df |> filter(EIN2 == ein & YEAR != all_years[i] - 1))$LOG_REV

            SIGMA.11 <- mat_cov[all_years[i], all_years[i]]
            SIGMA.22.inv <- solve(mat_cov[all_years[-i], all_years[-i]])
            SIGMA.12 <- mat_cov[all_years[i], all_years[-i], drop = FALSE]

            # Conditional Gaussian mean / sd of the held-out year given the rest
            predictions[i] <- mu_1 + (SIGMA.12 %*% SIGMA.22.inv %*% (log_revenue - mu_2)) #mu_bar
            standard_errors[i] <- sqrt(as.numeric(SIGMA.11 - (SIGMA.12 %*% SIGMA.22.inv %*% t(SIGMA.12))))
      }

      residuals <- abs(((df |> filter(EIN2 == ein))$LOG_REV - predictions) / standard_errors) # standardized residuals

      all_predictions[[ein]] <- predictions
      all_se[[ein]] <- standard_errors
      # Convert 1-based year index back to calendar year (index 1 == 1989)
      all_candidates[[ein]] <- all_years[which(residuals > 3)] - 1 + 1989

      # NOTE(review): all_candidates[ein] (single bracket) passes a one-element
      # list, not the vector — confirm plot_true_vs_pred() expects that.
      p <- plot_true_vs_pred(df, ein, predictions, standard_errors, all_years, all_candidates[ein], best_combos[ein, ])
      print(p)
}

print(Sys.time() - start.time)

Response surfaces

Single org

Test Run (no optimization)

Setup

All orgs

Single Organization (TEST)